Load R packages

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggExtra) ; library(ggpubr)
library(biomaRt) ; library(DESeq2) ; library(sva) ; library(WGCNA) ; library(vsn)
library(dendextend) ; library(expss)
library(knitr) ; library(kableExtra)


Preparing the data


Dataset downloaded from mgandal’s github repository.


Load data and perform some initial transformations of the variables

# Load csvs
datExpr = read.csv('./../Data/inputData/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../Data/inputData/RNAseq_ASD_datMeta.csv')

# 1. Group brain regions by lobes
# 2. Remove '/' from Batch variable: (It is recommended (but not required) to use only letters, numbers, 
#    and delimiters '_' or '.', in levels of factors as these are safe characters for column names in R
# 3. Transform Diagnosis into a factor variable with the first level being the control group
datMeta = datMeta %>% mutate(Brain_Region = Region %>% as.factor) %>% 
                      mutate(Brain_lobe = ifelse(Brain_Region %in% c('BA4_6','BA9','BA24','BA44_45'), 'Frontal',
                                          ifelse(Brain_Region %in% c('BA3_1_2_5', 'BA7'), 'Parietal',
                                          ifelse(Brain_Region %in% c('BA38','BA39_40','BA20_37','BA41_42_22'), 
                                                 'Temporal',
                                          'Occipital')))) %>%
                      mutate(Batch = gsub('/', '.', RNAExtractionBatch %>% as.factor), 
                             Diagnosis = factor(Diagnosis_, levels=c('CTL','ASD'))) %>% 
                      dplyr::select(-Diagnosis_)

# SFARI Genes
SFARI_genes = read_csv('./../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]

# NCBI biotype annotation
NCBI_biotype = read.csv('./../../NCBI/Data/gene_biotype_info.csv') %>% 
               dplyr::rename('ensembl_gene_id'=Ensembl_gene_identifier, 'gene_biotype'=type_of_gene, 
                             'hgnc_symbol'=Symbol) %>% 
               mutate(gene_biotype = ifelse(gene_biotype=='protein-coding','protein_coding', gene_biotype))

Check sample composition


Data description taken from the dataset’s synapse entry: RNAseq data was generated from 88 postmortem cortex brain samples from subjects with ASD (53 samples from 24 subjects) and non-psychiatric controls (35 samples from 17 subjects), across four cortical regions encompassing all major cortical lobes – frontal, temporal, parietal, and occipital. Brain samples were obtained from the Harvard Brain Bank as part of the Autism Tissue Project (ATP).


The dataset includes 63682 genes from 88 samples belonging to 41 different subjects


Counts distribution: More than half of the counts are zero and most of the counts are relatively low, but there are some very high outliers

counts = datExpr %>% melt

count_distr = data.frame('Statistic' = c('Min', '1st Quartile', 'Median', 'Mean', '3rd Quartile', 'Max'),
                         'Values' = c(min(counts$value), quantile(counts$value, probs = c(.25, .5)) %>% unname,
                                      mean(counts$value), quantile(counts$value, probs = c(.75)) %>% unname,
                                      max(counts$value)))

count_distr %>% kable(digits = 2, format.args = list(scientific = FALSE)) %>% kable_styling(full_width = F)
Statistic Values
Min 0.00
1st Quartile 0.00
Median 0.00
Mean 564.09
3rd Quartile 27.00
Max 27183314.00
rm(counts, count_distr)




2.2.1 Gene annotation


Biotype

I was originally running this with the feb2014 version of BioMart because that’s the one that Gandal used (and it finds all of the Ensembl IDs, which other versions don’t), but it has some outdated biotype annotations, to fix this I’ll obtain all the information except the biotype label from BioMart in the same way as it had been done before, and then I’ll add the most current biotype label using information from NCBI’s website and then from BioMart in the following way:

  1. Use BioMart to run a query with the original feb2014 version using the Ensembl IDs as keys to obtain all the information except the biotype labels

  2. Annotate genes with Biotype labels:

    2.1 Use the NCBI annotations downloaded from NCBI’s website and processed in NCBI/RMarkdowns/clean_data.html (there is information for only 26K genes, so some genes will remain unlabelled)

    2.2 Use the current version (jan2020) to obtain the biotype annotations using the Ensembl ID as keys (some genes don’t return a match)

    2.3 For the genes that didn’t return a match, use the current version (jan2020) to obtain the biotype annotations using the gene name as keys (17 genes return multiple labels)

    2.4 For the genes that returned multiple labels, use the feb2014 version with the Ensembl IDs as keys


labels_source = data.frame('source' = c('NCBI', 'BioMart2020_byID', 'BioMart2020_byGene', 'BioMart2014'),
                                      'n_matches' = rep(0,4))

########################################################################################
# 1. Query archive version

getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
            'end_position','strand')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl', 
               host = 'feb2014.archive.ensembl.org')
datGenes = getBM(attributes = getinfo, filters=c('ensembl_gene_id'), values = rownames(datExpr), mart=mart) %>% 
           rename(external_gene_id = 'hgnc_symbol')
datGenes$length = datGenes$end_position - datGenes$start_position

cat(paste0('1. ', sum(is.na(datGenes$start_position)), '/', nrow(datGenes),
             ' Ensembl IDs weren\'t found in the feb2014 version of BioMart'))
## 1. 0/63677 Ensembl IDs weren't found in the feb2014 version of BioMart
########################################################################################
########################################################################################
# 2. Get Biotype Labels

cat('2. Add biotype information')
## 2. Add biotype information
########################################################################################
# 2.1 Add NCBI annotations
datGenes = datGenes %>% left_join(NCBI_biotype, by=c('ensembl_gene_id','hgnc_symbol'))

cat(paste0('2.1 ' , sum(is.na(datGenes$gene_biotype)), '/', nrow(datGenes),
             ' Ensembl IDs weren\'t found in the NCBI database'))
## 2.1 42904/63677 Ensembl IDs weren't found in the NCBI database
labels_source$n_matches[1] = sum(!is.na(datGenes$gene_biotype))

########################################################################################
# 2.2 Query current BioMart version for gene_biotype using Ensembl ID as key

getinfo = c('ensembl_gene_id','gene_biotype')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl',
               host = 'jan2020.archive.ensembl.org')
datGenes_biotype = getBM(attributes = getinfo, filters = c('ensembl_gene_id'), mart=mart, 
                         values = datGenes$ensembl_gene_id[is.na(datGenes$gene_biotype)])

cat(paste0('2.2 ' , sum(is.na(datGenes$gene_biotype))-nrow(datGenes_biotype), '/', 
           sum(is.na(datGenes$gene_biotype)),
           ' Ensembl IDs weren\'t found in the jan2020 version of BioMart when querying by Ensembl ID'))
## 2.2 9099/42904 Ensembl IDs weren't found in the jan2020 version of BioMart when querying by Ensembl ID
# Add new gene_biotype info to datGenes
datGenes = datGenes %>% left_join(datGenes_biotype, by='ensembl_gene_id') %>%
           mutate(gene_biotype = coalesce(as.character(gene_biotype.x), gene_biotype.y)) %>%
           dplyr::select(-gene_biotype.x, -gene_biotype.y)

labels_source$n_matches[2] = sum(!is.na(datGenes$gene_biotype)) - labels_source$n_matches[1]

########################################################################################
# 3. Query current BioMart version for gene_biotype using gene symbol as key

missing_genes = unique(datGenes$hgnc_symbol[is.na(datGenes$gene_biotype)])
getinfo = c('hgnc_symbol','gene_biotype')
datGenes_biotype_by_gene = getBM(attributes=getinfo, filters=c('hgnc_symbol'), mart=mart,
                                 values=missing_genes)

cat(paste0('2.3 ', length(missing_genes)-length(unique(datGenes_biotype_by_gene$hgnc_symbol)),'/',
           length(missing_genes),
           ' genes weren\'t found in the current BioMart version when querying by gene name'))
## 2.3 5712/7866 genes weren't found in the current BioMart version when querying by gene name
dups = unique(datGenes_biotype_by_gene$hgnc_symbol[duplicated(datGenes_biotype_by_gene$hgnc_symbol)])
cat(paste0('    ', length(dups), ' genes returned multiple labels (these won\'t be added)'))
##     17 genes returned multiple labels (these won't be added)
# Update information
datGenes_biotype_by_gene = datGenes_biotype_by_gene %>% filter(!hgnc_symbol %in% dups)
datGenes = datGenes %>% left_join(datGenes_biotype_by_gene, by='hgnc_symbol') %>% 
            mutate(gene_biotype = coalesce(gene_biotype.x, gene_biotype.y)) %>%
            dplyr::select(-gene_biotype.x, -gene_biotype.y)

labels_source$n_matches[3] = sum(!is.na(datGenes$gene_biotype)) - sum(labels_source$n_matches)

########################################################################################
# 4. Query feb2014 BioMart version for the missing biotypes

missing_ensembl_ids = unique(datGenes$ensembl_gene_id[is.na(datGenes$gene_biotype)])

getinfo = c('ensembl_gene_id','gene_biotype')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl', 
               host = 'feb2014.archive.ensembl.org')
datGenes_biotype_archive = getBM(attributes = getinfo, filters=c('ensembl_gene_id'), 
                                 values = missing_ensembl_ids, mart=mart)

cat(paste0('2.4 ', length(missing_ensembl_ids)-nrow(datGenes_biotype_archive),'/',length(missing_ensembl_ids),
             ' genes weren\'t found in the feb2014 BioMart version when querying by Ensembl ID'))
## 2.4 0/6648 genes weren't found in the feb2014 BioMart version when querying by Ensembl ID
# Update information
datGenes = datGenes %>% left_join(datGenes_biotype_archive, by='ensembl_gene_id') %>% 
            mutate(gene_biotype = coalesce(gene_biotype.x, gene_biotype.y)) %>%
            dplyr::select(-gene_biotype.x, -gene_biotype.y)

labels_source$n_matches[4] = sum(!is.na(datGenes$gene_biotype)) - sum(labels_source$n_matches)

########################################################################################
# Plot results

labels_source = labels_source %>% mutate(x = 1, percentage = round(100*n_matches/sum(n_matches),1))

p = labels_source %>% ggplot(aes(x, percentage, fill=source)) + geom_bar(position='stack', stat='identity') +
    theme_minimal() + coord_flip() + theme(legend.position='bottom', axis.title.y=element_blank(),
    axis.text.y=element_blank(), axis.ticks.y=element_blank())

ggplotly(p + theme(legend.position='none'))
as_ggplot(get_legend(p))

########################################################################################
# Reorder rows to match datExpr
datGenes = datGenes[match(rownames(datExpr), datGenes$ensembl_gene_id),]


rm(getinfo, mart, datGenes_biotype, datGenes_biotype_by_gene, datGenes_biotype_archive,
   dups, missing_ensembl_ids, missing_genes, labels_source, p)


Neuronal function

The neuronal function is obtained from the Gene Ontology annotations for each gene, defining each gene that contains the substring ‘neuro’ as having a neuronal function. The pipeline to process and clean each gene’s GO annotations can be found in ./../Data/inputData/genes_GO_annotations.csv

GO_annotations = read.csv('./../Data/inputData/genes_GO_annotations.csv')

GO_neuronal = GO_annotations %>% filter(grepl('neuro', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>% mutate('Neuronal'=1)

rm(GO_annotations)




2.2.2 Filtering



Checking how many SFARI genes are in the dataset

df = SFARI_genes %>% dplyr::select(-gene_biotype) %>% inner_join(datGenes, by=c('ID'='ensembl_gene_id'))
n_SFARI = df[['gene-symbol']] %>% unique %>% length

Considering all genes, this dataset contains 911 of the 912 SFARI genes

1.- Remove rows that don’t correspond to genes

to_keep = !is.na(datGenes$length)

Names of the rows removed: __no_feature, __ambiguous, __too_low_aQual, __not_aligned, __alignment_not_unique

datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datGenes) = datGenes$ensembl_gene_id


Removed 5 ‘genes’, 63677 remaining




2. Keep only protein coding genes

35% of the genes are protein coding genes

datGenes$gene_biotype %>% table %>% sort(decreasing=TRUE) %>% kable(caption='Biotypes of genes in dataset') %>%
                          kable_styling(full_width = F)
Biotypes of genes in dataset
. Freq
protein_coding 22543
lncRNA 12167
processed_pseudogene 10117
unprocessed_pseudogene 2547
1 2314
miRNA 2276
misc_RNA 2178
snRNA 2043
pseudogene 1410
snoRNA 1202
lincRNA 840
transcribed_unprocessed_pseudogene 682
rRNA_pseudogene 500
transcribed_processed_pseudogene 441
antisense 380
3 331
6 314
IG_V_pseudogene 254
IG_V_gene 179
TR_V_gene 146
transcribed_unitary_pseudogene 86
TR_J_gene 81
unitary_pseudogene 74
processed_transcript 72
sense_intronic 72
IG_D_gene 64
rRNA 49
TR_V_pseudogene 46
sense_overlapping 38
scaRNA 31
polymorphic_pseudogene 28
7 25
IG_J_gene 24
IG_C_gene 23
Mt_tRNA 22
4 17
IG_C_pseudogene 11
TEC 11
TR_C_gene 8
3prime_overlapping_ncrna 6
IG_J_pseudogene 6
ribozyme 5
TR_J_pseudogene 5
TR_D_gene 3
Mt_rRNA 2
8 1
translated_processed_pseudogene 1
translated_unprocessed_pseudogene 1
vaultRNA 1

Most of the non-protein coding genes have very low levels of expression

plot_data = data.frame('ID' = rownames(datExpr), 'MeanExpr' = apply(datExpr, 1, mean),
                       'ProteinCoding' = datGenes$gene_biotype=='protein_coding',
                       'zeros' = apply(datExpr,1,function(x) 100*mean(x==0)>75))

58% of the non-protein coding genes have over 75% of their entries with a value of zero as opposed to 23% in the protein coding genes.

ggplotly(plot_data %>% ggplot(aes(log2(MeanExpr+1), fill=ProteinCoding, color=ProteinCoding)) + 
         geom_density(alpha=0.5) + theme_minimal())
rm(plot_data)
df = SFARI_genes %>% dplyr::select(-gene_biotype) %>% inner_join(datGenes, by=c('ID'='ensembl_gene_id'))

Filtering protein coding genes, we are left with 907 SFARI Genes (we lose 4 genes)

Note: The gene name for Ensembl ID ENSG00000187951 is wrong, it should be AC091057.1 instead of ARHGAP11B, but the biotype is right, so it would still be filtered out

n_SFARI = df[['gene-symbol']][df$gene_biotype=='protein_coding'] %>% unique %>% length

df %>% filter(!`gene-symbol` %in% df$`gene-symbol`[df$gene_biotype=='protein_coding']) %>% 
       dplyr::select(ID, `gene-symbol`, `gene-score`, gene_biotype, syndromic, `number-of-reports`) %>% 
       kable(caption='Lost Genes')  %>% kable_styling(full_width = F)
Lost Genes
ID gene-symbol gene-score gene_biotype syndromic number-of-reports
ENSG00000187951 ARHGAP11B 3 lncRNA 0 2
ENSG00000251593 MSNP1AS 2 processed_pseudogene 0 12
ENSG00000233067 PTCHD1-AS 2 lncRNA 0 3
ENSG00000197558 SSPO 3 transcribed_unitary_pseudogene 0 4
rm(df)
if(!all(rownames(datExpr)==rownames(datGenes))) cat('!!! gene rownames do not match!!!')

to_keep = datGenes$gene_biotype=='protein_coding'
datExpr = datExpr %>% filter(to_keep)
datGenes = datGenes %>% filter(to_keep)
rownames(datExpr) = datGenes$ensembl_gene_id
rownames(datGenes) = datGenes$ensembl_gene_id


Removed 41134 genes. 22543 remaining




3. Remove genes with low levels of expression

\(\qquad\) 3.1 Remove genes with zero expression in all of the samples

to_keep = rowSums(datExpr) > 0

df = data.frame('rowSums' = rowSums(datExpr), 'ensembl_gene_id' = rownames(datExpr)) %>%
     right_join(SFARI_genes, by='ensembl_gene_id') %>% filter(rowSums == 0 & !is.na(`gene-score`)) %>%
     arrange(`gene-score`) %>% dplyr::select(-ensembl_gene_id) %>%
     filter(!duplicated(`gene-symbol`), !`gene-symbol` %in% datGenes$hgnc_symbol[to_keep])

datExpr = datExpr[to_keep,]
datGenes = datGenes[to_keep,]


\(\qquad\) Removed 3368 genes. 19175 remaining

\(\qquad\) 904 SFARI genes remaining (we lost 3 genes)

n_SFARI = SFARI_genes[['gene-symbol']][SFARI_genes$ID %in% rownames(datExpr)] %>% unique %>% length

df %>% dplyr::select(ID, `gene-symbol`, `gene-score`, gene_biotype, syndromic, `number-of-reports`) %>% 
       kable(caption='Lost Genes with Top Scores')  %>% kable_styling(full_width = F)
Lost Genes with Top Scores
ID gene-symbol gene-score gene_biotype syndromic number-of-reports
ENSG00000235718 MFRP 2 protein_coding 0 6
ENSG00000186393 KRT26 3 protein_coding 0 2
ENSG00000221888 OR1C1 3 protein_coding 0 3
rm(df)


\(\qquad\) 3.2 Removing genes with a high percentage of zeros


\(\qquad\) Choosing the threshold:

\(\qquad\) Criteria for selecting the percentage of zeros threshold: The minimum value in which the preprocessed data is relatively homoscedastic (we’re trying to get rid of the group of genes with very low mean and SD that make the cloud of points look like a comic book speech bubble)

\(\qquad\) - On the plot I’m using the “dual” of the maximum percentage of zeros, the minimum percentage of non zeros so the visualisation is more intuitive

\(\qquad\) - 75% seems to be a good threshold for the minimum percentage of non zeros, so 25% will be the maximum percentage of zeros allowed in a row

\(\qquad\) - The Mean vs SD plot doesn’t show all of the genes, a random sample was selected for the genes with higher level of expression so the visualisation wouldn’t be as heavy (and since we care about the genes with the lowest levels of expression, we aren’t losing important information)

\(\qquad\) - I’m only selecting a sample of the genes with higher levels of expression (>7) so the plot is not that heavy

datMeta_original = datMeta
datExpr_original = datExpr
datGenes_original = datGenes
thresholds = round(100*(88-c(seq(72,7,-5),5,3,2,0))/88)

for(threshold in thresholds){
  
  datMeta = datMeta_original
  datExpr = datExpr_original
  datGenes = datGenes_original
  
  to_keep = apply(datExpr, 1, function(x) 100*mean(x>0)) >= threshold
  datGenes = datGenes[to_keep,]
  datExpr = datExpr[to_keep,]
  
  # Filter outlier samples
  absadj = datExpr %>% bicor %>% abs
  netsummary = fundamentalNetworkConcepts(absadj)
  ku = netsummary$Connectivity
  z.ku = (ku-mean(ku))/sqrt(var(ku))
  
  to_keep = z.ku > -2
  datMeta = datMeta[to_keep,]
  datExpr = datExpr[,to_keep]
  
  rm(absadj, netsummary, ku, z.ku, to_keep)
  
  
  # Create a DeseqDataSet object, estimate the library size correction and save the normalized counts matrix
  counts = datExpr %>% as.matrix
  rowRanges = GRanges(datGenes$chromosome_name,
                    IRanges(datGenes$start_position, width=datGenes$length),
                    strand=datGenes$strand,
                    feature_id=datGenes$ensembl_gene_id)
  se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
  dds = DESeqDataSet(se, design =~Diagnosis)
  
  # Perform vst
  vsd = vst(dds)
  
  datExpr_vst = assay(vsd)
  datMeta_vst = colData(vsd)
  datGenes_vst = rowRanges(vsd)
  
  rm(counts, rowRanges, se, vsd)
  
  # Save summary results in dataframe
  if(threshold == thresholds[1]){
    mean_vs_sd_data = data.frame('threshold'=threshold, 'ID'=rownames(datExpr_vst),
                                 'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
  } else {
    new_entries = data.frame('threshold'=threshold, 'ID'=rownames(datExpr_vst),
                                 'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
    mean_vs_sd_data = rbind(mean_vs_sd_data, new_entries)
  }
}

# Visualise the effects of different thresholds
to_keep_1 = mean_vs_sd_data$ID[mean_vs_sd_data$threshold==thresholds[1] & mean_vs_sd_data$Mean<7] %>%
            as.character
to_keep_2 = mean_vs_sd_data$ID[mean_vs_sd_data$threshold==thresholds[1] & mean_vs_sd_data$Mean>=7]
to_keep_2 = to_keep_2 %>% sample(round(length(to_keep_2)/10)) %>% as.character

plot_data = mean_vs_sd_data[mean_vs_sd_data$ID %in% c(to_keep_1, to_keep_2),]

ggplotly(plot_data %>% ggplot(aes(Mean, SD)) + 
         geom_point(color='#0099cc', alpha=0.2, aes(id=ID, frame=threshold)) + 
         scale_x_log10() + scale_y_log10() + theme_minimal())
# Plot remaining genes
plot_data = mean_vs_sd_data %>% group_by(threshold) %>% tally

ggplotly(plot_data %>% ggplot(aes(threshold, n)) + geom_point() + geom_line() +
         theme_minimal() + ggtitle('Remaining genes for each filtering threshold'))
rm(to_keep_1, to_keep_2, plot_data, dds, thresholds)
# Return to original variables
datExpr = datExpr_original
datGenes = datGenes_original
datMeta = datMeta_original

rm(datExpr_original, datGenes_original, datMeta_original, datExpr_vst, datGenes_vst, datMeta_vst)


\(\qquad\) Selecting a threshold of 75

# Minimum percentage of non-zero entries allowed per gene
threshold = 75

to_keep = apply(datExpr, 1, function(x) 100*mean(x>0)) >= threshold
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]

Removed 3028 genes. 16147 remaining

864 SFARI genes remaining (we lost 40 genes)

n_SFARI = SFARI_genes[['gene-symbol']][SFARI_genes$ID %in% rownames(datExpr)] %>% unique %>% length

rm(threshold, to_keep)


4. Remove outlier samples

\(\qquad\) 4.1 Gandal filters samples belonging to subject AN03345 without giving an explanation. Since it could have some technical problems, I will remove them as well

to_keep = (datMeta$Subject_ID != 'AN03345')
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]

\(\qquad\) Removed 2 samples 86 remaining


\(\qquad\) 4.2 Filter out outliers: Using node connectivity as a distance measure, normalising it and filtering out genes farther away than 2 standard deviations from the left (lower connectivity than average, not higher)

\(\qquad\) - Gandal uses the formula \(s_{ij}=\frac{1+bw(i,j)}{2}\) to convert all the weights to positive values, but I used \(s_{ij}=|bw(i,j)|\) instead because I think it makes more sense. In the end it doesn’t matter because they select as outliers the same six samples

\(\qquad\) - All the outlier samples belong to the ASD group. Apart from this they don’t seem to have any characterstic in common (different subjects, extraction batches, brain lobes, age, PMI), except for Sex (but this is probably just because of the sex bias in the dataset)

absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))

plot_data = data.frame('sample'=1:length(z.ku), 'distance'=z.ku, 'Sample_ID'=datMeta$Sample_ID, 
                       'Subject_ID'=datMeta$Subject_ID, 'Extraction_Batch'=datMeta$RNAExtractionBatch,
                       'Brain_Lobe'=datMeta$Brain_lobe, 'Sex'=datMeta$Sex, 'Age'=datMeta$Age,
                       'Diagnosis'=datMeta$Diagnosis, 'PMI'=datMeta$PMI)

selectable_scatter_plot(plot_data, plot_data[,-c(1:3)])

Outlier samples: AN01971_BA38, AN17254_BA17, AN09714_BA38, AN01093_BA7, AN02987_BA17, AN11796_BA7

to_keep = z.ku > -2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]

rm(absadj, netsummary, ku, z.ku, plot_data)

Removed 6 samples, 80 remaining

rm(to_keep)



5. Remove repeated genes

There are 15 genes with more than one ensembl ID in the dataset. I am going to remove them.

dup_gene_names = datGenes %>% filter(datGenes$hgnc_symbol %>% duplicated) %>% pull(hgnc_symbol)

datExpr = datExpr[!datGenes$hgnc_symbol %in% dup_gene_names,]
datGenes = datGenes %>% filter(!hgnc_symbol %in% dup_gene_names)

#dup_genes = datGenes$hgnc_symbol %>% duplicated
#datGenes = datGenes[!dup_genes,]
#datExpr = datExpr[!dup_genes,]

Removed 15 genes. 16117 remaining

864 SFARI genes remaining (we lost 0 genes)

rm(dup_gene_names, n_SFARI)




After filtering, the dataset consists of 16117 genes and 80 samples

Save filtered and annotated dataset

save(datExpr, datMeta, datGenes, file='./../Data/preprocessedData/filtered_raw_data.RData')
#load('./../Data/preprocessedData/filtered_raw_data.RData')




2.2.3 Batch effects modelling



Note: No batch correction is performed in this section, this is done after the normalisation step

According to Tackling the widespread and critical impact of batch effects in high-throughput data, technical artifacts can be an important source of variability in the data, so batch correction should be part of the standard preprocessing pipeline of gene expression data.

They say Processing group and Date of the experiment are good batch surrogates, so I’m going to see if they affect the data in any clear way to use them as surrogates.

Known sources of batch effects


Brain Bank

All the information we have is the Brain Bank, and although all the samples were obtained from the Autism Tissue Project, we don’t have any more specific information about who preprocessed each sample

table_info = datMeta %>% apply_labels(Brain_Bank = 'Brain Bank', RNAExtractionBatch = 'RNA Extraction Batch',
                                      Diagnosis = 'Diagnosis')

table_info$Brain_Bank %>% cro
 #Total 
 Brain Bank 
   ATP  80
   #Total cases  80


Date of processing

There are two different dates when the data was procesed

table_info$RNAExtractionBatch %>% cro
 #Total 
 RNA Extraction Batch 
   10/10/2014  53
   6/20/2014  27
   #Total cases  80


Luckily, there doesn’t seem to be a correlation between the batch surrogate and the objective variable, so the batch effect will not get confused with the Diagnosis effect

cro(table_info$RNAExtractionBatch, table_info$Diagnosis)
 Diagnosis 
 CTL   ASD 
 RNA Extraction Batch 
   10/10/2014  24 29
   6/20/2014  11 16
   #Total cases  35 45
rm(table_info)

*All the samples from each subject were processed on the same day

Samples don’t seem to cluster together that strongly by batch, although there does seem to be some kind of relation

h_clusts = datExpr %>% t %>% dist %>% hclust %>% as.dendrogram

create_viridis_dict = function(){
  min_age = datMeta$Age %>% min
  max_age = datMeta$Age %>% max
  viridis_age_cols = viridis(max_age - min_age + 1)
  names(viridis_age_cols) = seq(min_age, max_age)
  
  return(viridis_age_cols)
}
viridis_age_cols = create_viridis_dict()

dend_meta = datMeta[match(substring(labels(h_clusts),2), datMeta$Dissected_Sample_ID),] %>% 
            mutate('Extraction Date' = ifelse(RNAExtractionBatch=='10/10/2014', '#F8766D', '#00BFC4'),
                   'Diagnosis' = ifelse(Diagnosis=='CTL','#008080','#86b300'), # Blue control, Green ASD
                   'Gender' = ifelse(Sex=='F','#ff6666','#008ae6'),                # Pink Female, Blue Male
                   'Brain Lobe' = case_when(Brain_lobe=='Frontal'~'#F8766D',        # ggplot defaults for 4 colours
                                        Brain_lobe=='Temporal'~'#7CAE00',
                                        Brain_lobe=='Parietal'~'#00BFC4',
                                        Brain_lobe=='Occipital'~'#C77CFF'),
                   'Age' = viridis_age_cols[as.character(Age)]) %>%             # Purple: young, Yellow: old
            dplyr::select(Age, `Brain Lobe`, Gender, Diagnosis, `Extraction Date`)
h_clusts %>% as.dendrogram %>% dendextend::set('labels', rep('', nrow(datMeta))) %>% 
             dendextend::set('branches_k_color', k=9) %>% plot
colored_bars(colors=dend_meta)

rm(h_clusts, dend_meta, create_viridis_dict, viridis_age_cols)

Comparing the mean expression of each sample by batch we can see there could be some batch effect differentiating them

plot_data_b1 = data.frame('Mean'=colMeans(datExpr[,datMeta$RNAExtractionBatch=='10/10/2014']), 
                          'Batch'='10/10/2014')
plot_data_b2 = data.frame('Mean'=colMeans(datExpr[,datMeta$RNAExtractionBatch=='6/20/2014']), 
                          'Batch'='6/20/2014')

plot_data = rbind(plot_data_b1, plot_data_b2)
mu = plot_data %>% group_by(Batch) %>% dplyr::summarise(BatchMean=mean(Mean))

ggplotly(plot_data %>% ggplot(aes(x=Mean, color=Batch, fill=Batch)) + geom_density(alpha=0.3) + 
         geom_vline(data=mu, aes(xintercept=BatchMean, color=Batch), linetype='dashed') +
         ggtitle('Mean expression by sample grouped by Batch') + scale_x_log10() + theme_minimal())
rm(plot_data_b1, plot_data_b2, plot_data, mu)



Unknown sources of batch effects


Following the pipeline from Surrogate variable analysis: hidden batch effects where sva is used with DESeq2.

Create a DeseqDataSet object, estimate the library size correction and save the normalized counts matrix

counts = datExpr %>% as.matrix
rowRanges = GRanges(datGenes$chromosome_name,
                  IRanges(datGenes$start_position, width=datGenes$length),
                  strand=datGenes$strand,
                  feature_id=datGenes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
dds = DESeqDataSet(se, design = ~ Diagnosis)

dds = estimateSizeFactors(dds)
norm.cts = counts(dds, normalized = TRUE)

Provide the normalized counts and two model matrices to SVA. The first matrix uses the biological condition, and the second model matrix is the null model.

mod = model.matrix(~ Diagnosis, colData(dds))
mod0 = model.matrix(~ 1, colData(dds))
sva_fit = svaseq(norm.cts, mod=mod, mod0=mod0)
## Number of significant surrogate variables is:  13 
## Iteration (out of 5 ):1  2  3  4  5
rm(mod, mod0, norm.cts)

Found 13 surrogate variables

Include SV estimations to datMeta information

sv_data = sva_fit$sv %>% data.frame
colnames(sv_data) = paste0('SV', 1:ncol(sv_data))

datMeta_sva = cbind(datMeta, sv_data)

rm(sv_data, sva_fit)


In conclusion: Date of extraction works as a surrogate for batch effect and the sva package found other 13 variables that could work as surrogates which are now included in datMeta and should be included in the DEA.





2.2.4 Normalisation, differential expression analysis and data transformation



Using DESeq2 package to perform normalisation. I chose this package over limma because limma uses the log transformed data as input instead of the raw counts and I have discovered that in this dataset, this transformation affects genes differently depending on their mean expression level, and genes with a high SFARI score are specially affected by this.

plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr), 'SD'=apply(datExpr,1,sd))

plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.1) + geom_abline(color='gray') +
              scale_x_log10() + scale_y_log10() + theme_minimal()

rm(plot_data)
# Normalisation
counts = datExpr %>% as.matrix
rowRanges = GRanges(datGenes$chromosome_name,
                  IRanges(datGenes$start_position, width=datGenes$length),
                  strand=datGenes$strand,
                  feature_id=datGenes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta_sva)
dds = DESeqDataSet(se, design = ~ Batch + SV1 + SV2 + SV3 + SV4 + SV5 + SV6 + SV7 + SV8 + SV9 + 
                                  SV10 + SV11 + SV12 + SV13 + Diagnosis)
## Warning in DESeqDataSet(se, design = ~Batch + SV1 + SV2 + SV3 + SV4 + SV5 + :
## some variables in design formula are characters, converting to factors
# DEA
dds = DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
DE_info = results(dds, lfcThreshold=0, altHypothesis='greaterAbs')
DE_info_shrunken = lfcShrink(dds, coef='Diagnosis_ASD_vs_CTL', lfcThreshold=0, res = DE_info, quiet=TRUE)

# Data transformation
vsd = vst(dds)

datExpr_vst = assay(vsd)
datMeta_vst = colData(vsd)
datGenes_vst = rowRanges(vsd)

rm(counts, rowRanges, se, vsd)


DEA plots


The LFC shrinkage helps to reduce the LFC of genes with very low level of expression

plotMA(DE_info, main= 'Original LFC values')

plotMA(DE_info_shrunken, main = 'Shrunken LFC values')

Even though the shrunken LFC estimates weaken the relation between mean expression and LFC, it doesn’t disappear entirely.

p1 = data.frame('ID'=rownames(datExpr_vst), 'meanExpr'=rowMeans(datExpr_vst)) %>% 
     left_join(DE_info %>% data.frame %>% mutate(ID = rownames(.), significant = padj<0.05), by='ID') %>%
     ggplot(aes(meanExpr, abs(log2FoldChange), color = significant)) + 
     geom_point(alpha = 0.1, shape = 1) + geom_smooth(alpha = 0.01, aes(color=significant)) +
     xlab('Mean Expression') + ylab('Original LFC Magnitude') +
     theme_minimal() + theme(legend.position = 'none')

p2 = plot_data = data.frame('ID'=rownames(datExpr_vst), 'meanExpr'=rowMeans(datExpr_vst)) %>% 
     left_join(DE_info_shrunken %>% data.frame %>% mutate(ID = rownames(.), significant = padj<0.05), by='ID') %>% 
     ggplot(aes(meanExpr, abs(log2FoldChange), color = significant)) + 
     geom_point(alpha = 0.1, shape = 1) + geom_smooth(alpha = 0.2) + xlab('Mean Expression') + 
     ylab('Shrunken LFC Magnitude') + theme_minimal() + labs(color = 'DE')

ggarrange(p1,p2, nrow = 1, common.legend = TRUE, legend = 'bottom')


Data transformation plots


Using the plotting function DESEq2’s manual proposes to study vst’s output it looks like the data could be homoscedastic

meanSdPlot(datExpr_vst, plot=FALSE)$gg + theme_minimal()

Plotting points individually we can notice a small heteroscedasticity in the data survived the transformation

plot_data = data.frame('ID'=rownames(datExpr_vst), 'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))

plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.2) + geom_smooth(color = 'gray') +
              scale_x_log10() + scale_y_log10() + theme_minimal()

rm(plot_data)



Rename normalised datasets to continue working with these

datExpr = datExpr_vst
datMeta = datMeta_vst %>% data.frame
datGenes = datGenes_vst %>% data.frame %>% mutate(hgnc_symbol = datGenes$hgnc_symbol)
rownames(datGenes) = rownames(datExpr)

rm(datExpr_vst, datMeta_vst, datGenes_vst, datMeta_sva)





2.2.5 Batch Effects Correction



By including the surrogate variables in the DESeq formula we only modelled the batch effects into the DEA, but we didn’t actually correct them from the data, for that we need to use ComBat (or other equivalent package) in the already normalised data

Unknown sources of batch effects


In some places they say you shouldn’t correct these effects on the data because you risk losing biological variation, in others they say you should because they introduce noise to the data. The only thing everyone agrees on is that you shouldn’t remove them before performing DEA but instead include them in the model.

Based on the conclusions from Practical impacts of genomic data “cleaning” on biological discovery using surrogate variable analysis it seems like it may be a good idea to remove the batch effects from the data and not only from the DE analysis:

  • Using SVA, ComBat or related tools can increase the power to identify specific signals in complex genomic datasets (they found “greatly sharpened global and gene-specific differential expression across treatment groups”)

  • But caution should be exercised to avoid removing biological signal of interest

  • We must be precise and deliberate in the design and analysis of experiments and the resulting data, and also mindful of the limitations we impose with our own perspective

  • Open data exploration is not possible after such supervised “cleaning”, because effects beyond those stipulated by the researcher may have been removed

Comparing data with and without surrogate variable correction

# Taken from https://www.biostars.org/p/121489/#121500
correctDatExpr = function(datExpr, mod, svs) {
  X = cbind(mod, svs)
  Hat = solve(t(X) %*% X) %*% t(X)
  beta = (Hat %*% t(datExpr))
  rm(Hat)
  gc()
  P = ncol(mod)
  return(datExpr - t(as.matrix(X[,-c(1:P)]) %*% beta[-c(1:P),]))
}

pca_samples_before = datExpr %>% t %>% prcomp
pca_genes_before = datExpr %>% prcomp

# Correct
mod = model.matrix(~ Diagnosis, colData(dds))
svs = datMeta %>% dplyr::select(SV1:SV13) %>% as.matrix
datExpr_corrected = correctDatExpr(as.matrix(datExpr), mod, svs)

pca_samples_after = datExpr_corrected %>% t %>% prcomp
pca_genes_after = datExpr_corrected %>% prcomp

rm(correctDatExpr)
Samples

Removing batch effects has a big impact in the distribution of the samples, separating them by diagnosis almost perfectly just using the first principal component

pca_samples_df = rbind(data.frame('ID'=colnames(datExpr), 'PC1'=pca_samples_before$x[,1],
                                  'PC2'=pca_samples_before$x[,2], 'corrected'=0),
                       data.frame('ID'=colnames(datExpr), 'PC1'=pca_samples_after$x[,1],
                                  'PC2'=pca_samples_after$x[,2], 'corrected'=1)) %>%
                 left_join(datMeta %>% mutate('ID'=rownames(datMeta)), by='ID')

ggplotly(pca_samples_df %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) + 
         geom_point(aes(frame=corrected, id=ID), alpha=0.75) + 
         xlab(paste0('PC1 (corr=', round(cor(pca_samples_before$x[,1],pca_samples_after$x[,1]),2),
                     '). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,1],1),' to ',
                     round(100*summary(pca_samples_after)$importance[2,1],1))) +
         ylab(paste0('PC2 (corr=', round(cor(pca_samples_before$x[,2],pca_samples_after$x[,2]),2),
                     '). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,2],1),' to ',
                     round(100*summary(pca_samples_after)$importance[2,2],1))) +
         ggtitle('Samples') + theme_minimal())
rm(pca_samples_df)


Genes

It seems like the sva correction preserves the mean expression of the genes and erases almost everything else (although what little else remains is enough to characterise the two Diagnosis groups pretty well using only the first PC)

*Plot is done with only 10% of the genes so it’s not that heavy

pca_genes_df = rbind(data.frame('ID'=rownames(datExpr), 'PC1'=pca_genes_before$x[,1],
                                'PC2'=pca_genes_before$x[,2], 'corrected'=0, 'MeanExpr'=rowMeans(datExpr)),
                     data.frame('ID'=rownames(datExpr), 'PC1'=pca_genes_after$x[,1],
                                'PC2'=pca_genes_after$x[,2], 'corrected'=1, 'MeanExpr'=rowMeans(datExpr)))

keep_genes = rownames(datExpr) %>% sample(0.1*nrow(datExpr))

pca_genes_df = pca_genes_df %>% filter(ID %in% keep_genes)

ggplotly(pca_genes_df %>% ggplot(aes(PC1, PC2,color=MeanExpr)) + 
         geom_point(alpha=0.3, aes(frame=corrected, id=ID)) +
         xlab(paste0('PC1 (corr=', round(cor(pca_genes_before$x[,1],pca_genes_after$x[,1]),2),
                     '). % Var explained: ', round(100*summary(pca_genes_before)$importance[2,1],1),' to ',
                     round(100*summary(pca_genes_after)$importance[2,1],1))) +
         ylab(paste0('PC2 (corr=', round(cor(pca_genes_before$x[,2],pca_genes_after$x[,2]),2),
                     '). % Var explained: ', round(100*summary(pca_genes_before)$importance[2,2],1),' to ',
                     round(100*summary(pca_genes_after)$importance[2,2],1))) +
         scale_color_viridis() + ggtitle('Genes') + theme_minimal())
rm(pca_samples_before, pca_genes_before, mod, svs, pca_samples_after, pca_genes_after, pca_genes_df, keep_genes)

Everything looks good, so we’re keeping the corrected expression dataset

datExpr = datExpr_corrected

rm(datExpr_corrected)


Known sources of batch effects


Even after correcting the dataset for the surrogate variables found with sva, there is still a difference in mean expression by processing date (although this difference is relatively small)

plot_data_b1 = data.frame('Mean'=colMeans(datExpr[,datMeta$RNAExtractionBatch=='10/10/2014']), 
                          'Batch'='10/10/2014')
plot_data_b2 = data.frame('Mean'=colMeans(datExpr[,datMeta$RNAExtractionBatch=='6/20/2014']), 
                          'Batch'='6/20/2014')

plot_data = rbind(plot_data_b1, plot_data_b2)
mu = plot_data %>% group_by(Batch) %>% dplyr::summarise(BatchMean=mean(Mean))
## `summarise()` ungrouping output (override with `.groups` argument)
ggplotly(plot_data %>% ggplot(aes(x=Mean, color=Batch, fill=Batch)) + geom_density(alpha=0.3) + 
         geom_vline(data=mu, aes(xintercept=BatchMean, color=Batch), linetype='dashed') +
         ggtitle('Mean expression by sample grouped by processing date') + scale_x_log10() + theme_minimal())
rm(plot_data_b1, plot_data_b2, plot_data, mu)


Performing Batch Correction for processing date


https://support.bioconductor.org/p/50983/

datExpr = datExpr %>% as.matrix %>% ComBat(batch=datMeta$Batch)
## Found2batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

Now both batches have the same mean expression

plot_data_b1 = data.frame('Mean'=colMeans(datExpr[,datMeta$RNAExtractionBatch=='10/10/2014']),
                          'Batch'='10/10/2014')
plot_data_b2 = data.frame('Mean'=colMeans(datExpr[,datMeta$RNAExtractionBatch=='6/20/2014']),
                          'Batch'='6/20/2014')

plot_data = rbind(plot_data_b1, plot_data_b2)
mu = plot_data %>% group_by(Batch) %>% dplyr::summarise(BatchMean=mean(Mean))
## `summarise()` ungrouping output (override with `.groups` argument)
ggplotly(plot_data %>% ggplot(aes(x=Mean, color=Batch, fill=Batch)) + geom_density(alpha=0.3) + 
         geom_vline(data=mu, aes(xintercept=BatchMean, color=Batch), linetype='dashed') +
         ggtitle('Mean expression by sample grouped by processing date') + scale_x_log10() + theme_minimal())
rm(plot_data_b1, plot_data_b2, plot_data, mu)




Save preprocessed dataset

# Join original and shrunken LFC values
genes_info = DE_info %>% data.frame %>% 
             cbind(DE_info_shrunken %>% data.frame %>% dplyr::select(log2FoldChange, lfcSE) %>% 
                   dplyr::rename('shrunken_log2FoldChange' = log2FoldChange, 'shrunken_lfcSE' = lfcSE)) %>%
             mutate(significant=padj<0.05 & !is.na(padj)) %>% 
             add_column('ID'=rownames(DE_info), .before='baseMean') %>% 
             left_join(GO_neuronal, by='ID') %>%
             mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal))

save(datExpr, datMeta, datGenes, genes_info, dds, file='./../Data/preprocessedData/preprocessed_data.RData')




Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.1.0            knitr_1.32                 
##  [3] expss_0.10.2                dendextend_1.13.4          
##  [5] vsn_3.52.0                  WGCNA_1.69                 
##  [7] fastcluster_1.1.25          dynamicTreeCut_1.63-1      
##  [9] sva_3.32.1                  genefilter_1.66.0          
## [11] mgcv_1.8-31                 nlme_3.1-147               
## [13] DESeq2_1.24.0               SummarizedExperiment_1.14.1
## [15] DelayedArray_0.10.0         BiocParallel_1.18.1        
## [17] matrixStats_0.56.0          Biobase_2.44.0             
## [19] GenomicRanges_1.36.1        GenomeInfoDb_1.20.0        
## [21] IRanges_2.18.3              S4Vectors_0.22.1           
## [23] BiocGenerics_0.30.0         biomaRt_2.40.5             
## [25] ggpubr_0.2.5                magrittr_2.0.1             
## [27] ggExtra_0.9                 GGally_1.5.0               
## [29] gridExtra_2.3               viridis_0.5.1              
## [31] viridisLite_0.3.0           RColorBrewer_1.1-2         
## [33] plotlyutils_0.0.0.9000      plotly_4.9.2               
## [35] glue_1.4.2                  reshape2_1.4.4             
## [37] forcats_0.5.0               stringr_1.4.0              
## [39] dplyr_1.0.1                 purrr_0.3.4                
## [41] readr_1.3.1                 tidyr_1.1.0                
## [43] tibble_3.0.3                ggplot2_3.3.2              
## [45] tidyverse_1.3.0            
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.1.8        Hmisc_4.4-0           
##   [4] plyr_1.8.6             lazyeval_0.2.2         splines_3.6.3         
##   [7] crosstalk_1.1.0.1      digest_0.6.27          foreach_1.5.0         
##  [10] htmltools_0.5.1.1      GO.db_3.8.2            fansi_0.4.1           
##  [13] checkmate_2.0.0        memoise_1.1.0          doParallel_1.0.15     
##  [16] cluster_2.1.0          limma_3.40.6           annotate_1.62.0       
##  [19] modelr_0.1.6           prettyunits_1.1.1      jpeg_0.1-8.1          
##  [22] colorspace_1.4-1       blob_1.2.1             rvest_0.3.5           
##  [25] haven_2.2.0            xfun_0.22              hexbin_1.28.1         
##  [28] crayon_1.3.4           RCurl_1.98-1.2         jsonlite_1.7.2        
##  [31] impute_1.58.0          iterators_1.0.12       survival_3.2-7        
##  [34] gtable_0.3.0           zlibbioc_1.30.0        XVector_0.24.0        
##  [37] webshot_0.5.2          scales_1.1.1           DBI_1.1.0             
##  [40] miniUI_0.1.1.1         Rcpp_1.0.6             xtable_1.8-4          
##  [43] progress_1.2.2         htmlTable_1.13.3       foreign_0.8-76        
##  [46] bit_4.0.4              preprocessCore_1.46.0  Formula_1.2-3         
##  [49] htmlwidgets_1.5.1      httr_1.4.2             acepack_1.4.1         
##  [52] ellipsis_0.3.1         farver_2.0.3           pkgconfig_2.0.3       
##  [55] reshape_0.8.8          XML_3.99-0.3           nnet_7.3-14           
##  [58] dbplyr_1.4.2           locfit_1.5-9.4         labeling_0.3          
##  [61] tidyselect_1.1.0       rlang_0.4.10           later_1.1.0.1         
##  [64] AnnotationDbi_1.46.1   munsell_0.5.0          cellranger_1.1.0      
##  [67] tools_3.6.3            cli_2.0.2              generics_0.0.2        
##  [70] RSQLite_2.2.0          broom_0.7.0            evaluate_0.14         
##  [73] fastmap_1.0.1          yaml_2.2.1             bit64_4.0.5           
##  [76] fs_1.5.0               mime_0.10              xml2_1.2.5            
##  [79] compiler_3.6.3         rstudioapi_0.11        curl_4.3              
##  [82] png_0.1-7              affyio_1.54.0          ggsignif_0.6.0        
##  [85] reprex_0.3.0           geneplotter_1.62.0     stringi_1.5.3         
##  [88] highr_0.8              lattice_0.20-41        Matrix_1.2-18         
##  [91] vctrs_0.3.2            pillar_1.4.6           lifecycle_0.2.0       
##  [94] BiocManager_1.30.10    cowplot_1.0.0          data.table_1.12.8     
##  [97] bitops_1.0-6           httpuv_1.5.5           affy_1.62.0           
## [100] R6_2.5.0               latticeExtra_0.6-29    promises_1.2.0.1      
## [103] codetools_0.2-16       assertthat_0.2.1       withr_2.2.0           
## [106] GenomeInfoDbData_1.2.1 hms_0.5.3              grid_3.6.3            
## [109] rpart_4.1-15           rmarkdown_2.7          shiny_1.4.0.2         
## [112] lubridate_1.7.4        base64enc_0.1-3